Comparing numerical data across groups
Contents
Comparing numerical data across groups
Setup
import warnings

import altair as alt
import pandas as pd

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Turn off Altair's max-rows check so larger DataFrames can be passed to charts.
alt.data_transformers.disable_max_rows()
# Out: DataTransformerRegistry.enable('default')
# (The line above is the value echoed by the call; it is notebook output, not a
# statement to run -- DataTransformerRegistry is never imported here.)
Data
Import data
# Base URL of the dataset repository and the name of the CSV file inside it.
ROOT = "https://raw.githubusercontent.com/kirenz/datasets/master/"
DATA = "county.csv"
# Load the CSV found at ROOT + DATA into a DataFrame.
df = pd.read_csv(ROOT + DATA)
# Select only relevant variables
data_selection = ["state", "name", "pop_change",
"population_change", "median_hh_income", "metro"]
# Exercise blank: replace ___ with the expression that restricts df to the
# columns named in data_selection.
df = ___
Data corrections
# drop missing values
# (inplace=True modifies df directly instead of returning a new DataFrame)
df.dropna(inplace=True)
# rename variable population_change to change (use: inplace=True)
# Exercise blank: the columns dict maps the old column name to the new one; the
# last keyword argument is the one named in the hint above.
df.rename(columns={'___': '___'}, ___=___)
# change data type to category
# Exercise blank: append the conversion call directly after df['change'].
df['change'] = df['change']___
Analysis
# count the values
# Exercise blank: fill in the Series method that tallies how often each value
# of `change` occurs; the expected result is printed directly below.
df['change'].___
no gain 1285
gain 1275
Name: change, dtype: int64
# Same tally for the `metro` column (exercise blank; expected result is printed
# directly below).
df['metro'].___
no 1615
yes 945
Name: metro, dtype: int64
Histogram for two groups
# use median_hh_income and change as color
# Exercise template: every ___ is a blank to fill in. The call chain has the
# same shape as the completed charts later in this file (for example
# alt.Chart(df).mark_bar().encode(...)): the first encoding is binned via
# alt.BinParams and the last one is the color channel.
___.___(___).___().___(
___=___.___("___",
bin=alt.BinParams(maxbins=___)),
___=___.___('___'),
color=___.___('___')
)
Side-by-side box plot
# Box plots of median_hh_income on the x axis, with one row and one color per
# value of `change`.
(
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        x=alt.X('median_hh_income'),
        y=alt.Y('change'),
        color=alt.Color('change'),
    )
    .properties(width=400, height=150)
)
Faceting
# Variant 1: facet through encoding channels. The binned income histogram is
# split into columns by `metro` and into rows by `change`.
income_bars = alt.Chart(df).mark_bar()
income_bars.encode(
    alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    alt.Y('count()'),
    alt.Column('metro'),  # <--
    alt.Row('change'),  # <--
).properties(width=200, height=100)
# Variant 2: build the plain binned histogram first, then split it into panels
# with .facet() -- columns by `metro`, rows by `change`.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
        y=alt.Y('count()'),
    )
    .properties(width=200, height=100)
    .facet(column='metro', row='change')  # <--
)
Pair plots
# Scatter-plot matrix: one panel template whose x and y fields are filled in by
# .repeat(), colored by `change`, with pan/zoom enabled via .interactive().
row_fields = ['pop_change', 'median_hh_income']
pair_panel = alt.Chart(df).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color=alt.Color('change'),
).properties(width=150, height=150)
# Columns use the same fields as the rows, in the opposite order.
pair_panel.repeat(
    row=row_fields,
    column=['median_hh_income', 'pop_change'],
).interactive()